Amandine Lecerf Defer
# if problem
# !pip3 install jupyter
# !ipython kernel install --name=python3 --user
# in terminal
# python -m pip install ipykernel
# python -m ipykernel install --user
!pip install missingno
!pip install termcolor
!pip install shapely
!pip install geopandas
!pip install wordcloud
# import packages
import missingno as msno
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import math
from termcolor import colored
from datetime import datetime
from datetime import timedelta
import scipy.stats as stats
from scipy.stats.stats import pearsonr
from scipy import stats
from scipy.stats import variation
import collections
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.options.display.width = 0
# Installation :
!pip install flake8 pycodestyle pycodestyle_magic
# Activate %%flake8 magic function
%load_ext pycodestyle_magic
# module to know if the text is with pep8 reglementation
# !pip install pycodestyle flake8
# !pip install pycodestyle_magic
# %load_ext pycodestyle_magic
# automatic search for the entire document : %pycodestyle_on or %flake8_on to turn off : %pycodestyle_off or %flake8_off
# for each cell : %%pycodestyle or %%flake8
# %%flake8 block showing output and to see ouptut, put %%flake8 in comment
# %%flake8
def informations(dataframe):
    """Print a general overview of a dataset.

    Shows the head, the row/column counts, the column names and the
    column dtypes.
    dataframe : dataset
    """
    n_rows, n_cols = dataframe.shape
    print(colored("\n Overview of the dataset : \n", 'red'))
    print(dataframe.head())
    print(colored("The dataset has {} rows and {} "
                  "columns. \n \n".format(n_rows, n_cols), 'blue'))
    print(colored("Column's name : \n", 'green'))
    print(dataframe.columns)
    print("\n")
    print(colored("Column's Type : \n", 'green'))
    print(dataframe.dtypes)
    print("\n \n")
# %%flake8
def pie_NaN(dataframe, size):
    """Draw a pie chart of the overall filling rate of a dataset.

    dataframe : dataset
    size : size of the figure (X,X)
    """
    n_rows, n_cols = dataframe.shape
    # Filled cells vs total cells (rows * columns)
    filled_cells = dataframe.count().sum()
    total_cells = n_cols * n_rows
    fill_rate = filled_cells / total_cells
    print("The data set is filled in at {:.2%}".format(fill_rate))
    print("and it has {:.2%} of missing data".format(1 - fill_rate))
    print("\n \n ")
    # Pie chart: filled data vs missing data
    plt.figure(figsize=size)
    plt.pie([fill_rate, 1 - fill_rate],
            explode=(0, 0.1),
            labels=["Données", "NAN"],
            colors=['gold', 'pink'],
            autopct='%.2f%%', shadow=True, textprops={'fontsize': 26})
    chart_title = plt.title("Fill rate of the dataset", fontsize=32)
    chart_title.set_position([0.5, 0.85])
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
# %%flake8
def graph_fill_variable(dataframe, size):
    """Bar plot of the filling rate of every column of the dataset.

    dataframe : dataset
    size : size of the figure (X, X)
    """
    # Per-column filling rate, sorted from the best- to the worst-filled
    fill = round(dataframe.shape[0] - dataframe.isna().sum()) / dataframe.shape[0]
    rates = (pd.DataFrame(fill, columns=['Filling rate'])
             .sort_values('Filling rate', ascending=False)
             .reset_index())
    fig, ax = plt.subplots(figsize=size)
    sns.barplot(y=rates['index'], x='Filling rate', data=rates)
    plt.title('Fill rate of the variables - %', fontsize=30)
    plt.show()
# %%flake8
def duplicate(dataframe, depend_on):
    """Drop duplicated rows of the dataset, judged on a single column.

    dataframe : dataset (modified in place and returned)
    depend_on : Variable used to search for duplicates.
    """
    size_before = len(dataframe)
    dataframe.drop_duplicates(inplace=True, subset=[depend_on])
    removed = size_before - len(dataframe)
    print("There are {} duplicates that have been removed from the dataset"
          .format(removed))
    return dataframe
# %%flake8
def plot_multiple_histograms(dataframe, size):
    """Plot the density histogram of every numerical column on a grid.

    dataframe : dataset
    size : size of the figure (x, x)
    """
    cols = dataframe.select_dtypes(include='number').columns.tolist()
    # BUG FIX: ``.tolist()`` returns a (possibly empty) list, never None,
    # so the previous ``cols is None`` test could not detect the absence
    # of numeric columns.
    if not cols:
        print("There are no numeric columns.")
        return
    num_plots = len(cols)
    # Arrange the subplots on a near-square grid.
    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots / num_cols)
    fig, axs = plt.subplots(num_rows, num_cols, figsize=size)
    for ind, col in enumerate(cols):
        i = math.floor(ind / num_cols)
        j = ind - i * num_cols
        # ``plt.subplots`` returns a bare Axes, a 1-D array or a 2-D
        # array depending on the grid shape, hence the three cases.
        if num_rows == 1:
            if num_cols == 1:
                sns.distplot(dataframe[col], kde=True, ax=axs)
            else:
                sns.distplot(dataframe[col], kde=True, ax=axs[j])
        else:
            sns.distplot(dataframe[col], kde=True, ax=axs[i, j])
# %%flake8
def description_var(dataframe):
    """Print a statistical analysis of every numerical variable and
    draw the associated log-scaled histogram and boxplot.

    dataframe : dataset whose numeric columns are described
    """
    df = dataframe.select_dtypes(include='number')
    for col in df:
        colData = df[col]
        # Location and dispersion statistics
        mean = np.mean(colData)
        median = np.median(colData)
        Q1 = np.percentile(colData, 25)
        Q3 = np.percentile(colData, 75)
        # NOTE(review): ``max``/``min`` shadow the builtins for the rest
        # of this loop body.
        max = colData.max()
        min = colData.min()
        variance = np.var(colData)
        standard_deviation = np.std(colData)
        # Shape statistics, computed through pandas on a 1-column frame
        skew = pd.DataFrame(colData).skew()[0]
        kurt = pd.DataFrame(colData).kurtosis()[0]
        print("Statistical measures for the variable {} \n" .format(col))
        print("The average of the variable {} is : {}".format(
            col, round(mean, 2)))
        print("The median of the variable {} is : {}".format(
            col, round(median, 2)))
        print("Quartile Q1 is : {} ".format(round(Q1, 2)))
        print("Quartile Q3 is : : {} ".format(round(Q3, 2)))
        print("The maximum is : {} ".format(max))
        print("The minimum is : {} \n \n".format(min))
        print("Measures of dispersion for the variable {} \n" .format(col))
        print("The variance of the variable {} is : {} " .format(
            col, round(variance, 2)))
        print("The standard deviation of the variable {} is : {} " .format(
            col, round(standard_deviation, 2)))
        print("The coefficient of variation of the variable {} is : {}"
              .format(col, round(variation(colData), 2)))
        print("The interquartile range of the variable {} is : {} \n \n"
              .format(col, round(Q3-Q1, 2)))
        print("Shape measures for the variable {}\n" .format(col))
        print("The empirical skewness for the variable {} is {} " .format(
            col, round(skew, 4)))
        # Interpret the sign of the skewness
        if (skew == 0):
            print("The distribution of the variable {} is symmetric.\n"
                  .format(col))
        elif (skew > 0):
            print("The distribution of the variable {} is spread"
                  "to the right. \n".format(col))
        else:
            print("The distribution of the variable {} is spread"
                  "to the left.\n" .format(col))
        print("The empirical Kurtosis for the variable {} is {} " .format(
            col, round(kurt, 4)))
        # Interpret the sign of the kurtosis (0 = same as the normal law)
        if kurt == 0:
            print("The distribution of the variable {} has the same smoothness"
                  "as the normal distribution.\n" .format(col))
        elif kurt > 0:
            print("The distribution of the variable {} is less flat"
                  "than the normal distribution, the observations"
                  "are more concentrated.\n"
                  .format(col))
        else:
            print("The distribution of the variable {} is flatter"
                  " than the normal distribution, the observations"
                  "are less concentrated.\n"
                  .format(col))
        # Log-scaled histogram of the variable
        plt.figure(figsize=(10, 8))
        df[col].hist(color='pink', edgecolor='red', log=True, )
        plt.title("Statistical representation of the variable {}"
                  "\n".format(col))
        plt.show()
        print("Boxplot of the variable {}".format(col))
        # Boxplot without fliers, whiskers and caps in blue, median in red
        plt.figure(figsize=(6, 6))
        df.boxplot(column=[col], return_type='axes', vert=True,
                   showfliers=False, showcaps=True, patch_artist=True,
                   color='tan', medianprops={'linestyle': '-',
                                             'linewidth': 2, 'color': 'red'},
                   whiskerprops={'linestyle': '-', 'linewidth': 2,
                                 'color': 'blue'},
                   capprops={'linestyle': '-', 'linewidth': 2,
                             'color': 'blue'})
        plt.show()
        print("\n \n")
# %%flake8
def verif_normalite(dataframe):
    """Run D'Agostino-Pearson's normality test on every numeric column.

    H0 (the column is normally distributed) is rejected when p < 5%.
    """
    numeric_cols = dataframe.select_dtypes(
        include=['int64', 'float64']).columns
    for column in numeric_cols:
        print('__\n{}'.format(column))
        k2, p = stats.normaltest(dataframe[column], axis=0,
                                 nan_policy='omit')
        print("p = {:g}".format(p))
        threshold = 5e-2
        if p < threshold:
            print(
                "H0 is rejected : {} is not normally distributed"
                .format(column))
        else:
            print(
                "H0 cannot be rejected:{} is normally distributed"
                .format(column))
# %%flake8
def delete_outliers_IQR(dataframe, n):
    """Remove outlier rows with the interquartile-range (IQR) method.

    A value is an outlier for a column when it lies outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; a row is dropped (in place) when it
    is an outlier in strictly more than ``n`` numeric columns.

    dataframe : dataset, modified in place
    n : number of extreme values to be taken into account to consider
        that a line is outlier
    """
    indexes_outliers = []
    # BUG FIX: the numeric columns were taken from an undefined ``df_in``
    # variable instead of the ``dataframe`` parameter.
    numeric_cols = dataframe.select_dtypes(
        include=['int64', 'float64']).columns.tolist()
    for col_name in numeric_cols:
        print('colonne ', col_name)
        Q1 = dataframe[col_name].quantile(0.25)
        Q3 = dataframe[col_name].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        list_outliers = dataframe[(dataframe[col_name] < lower_limit) |
                                  (dataframe[col_name] > upper_limit)].index
        indexes_outliers.extend(list_outliers)
    # Keep only the rows that are outliers in more than n columns.
    outlier_counts = Counter(indexes_outliers)
    multiple_outliers = [k for k, v in outlier_counts.items() if v > n]
    print(" ")
    print("remove {} outliers" .format(len(multiple_outliers)))
    # BUG FIX: ``drop(..., inplace=True)`` returns None, so chaining
    # ``.reset_index`` on it raised AttributeError; do the two in-place
    # steps separately.
    dataframe.drop(multiple_outliers, axis=0, inplace=True)
    dataframe.reset_index(drop=True, inplace=True)
# %%flake8
def delete_outliers_percentiles(dataframe):
    '''Remove rows holding extreme-percentile values.

    For every numeric column, rows above the 99.5th percentile, below
    the 0.5th percentile or negative are blanked out, then every row
    containing a NaN is dropped (in place).
    dataframe : dataset'''
    for column in dataframe.select_dtypes(include='number').columns.tolist():
        # Each quantile is re-evaluated on the data as already blanked.
        upper = dataframe[column].quantile(0.995)
        dataframe.loc[dataframe[column] > upper] = np.nan
        lower = dataframe[column].quantile(0.005)
        dataframe.loc[dataframe[column] < lower] = np.nan
        dataframe.loc[dataframe[column] < 0] = np.nan
    dataframe.dropna(inplace=True)
# %%flake8
def graphe_categories(dataframe, size):
    """Draw every categorical ('category' dtype) variable of the dataset
    as a pie chart and as a count histogram.

    dataframe : dataset
    size : size of the figure (X,X)
    """
    for col in list(dataframe.select_dtypes(include=['category'])):
        counts = dataframe[col].value_counts()
        # Pie chart of the category frequencies
        plt.figure(figsize=size)
        plt.title("Représentation de la variable {}" .format(col), fontsize=24)
        plt.pie(counts, labels=counts.index,
                autopct='%.1f%%', shadow=True, textprops={'fontsize': 20})
        plt.axis('equal')
        plt.tight_layout()
        plt.show()
        print("\n \n \n")
        # Count histogram of the same variable, slanted x labels
        plt.figure(figsize=size)
        axis = sns.countplot(x=dataframe[col])
        plt.title("Représentation de la variable {}" .format(col), fontsize=24)
        plt.rc('xtick', labelsize=14)
        axis.set_xticklabels(axis.get_xticklabels(), rotation=70, ha="right")
        plt.tight_layout()
        plt.show()
        print("\n \n \n")
# %%flake8
def pie_col_category(dataframe, col, size):
    """Draw one categorical variable of the dataset as a pie chart.

    dataframe : dataset
    col : column to represent
    size : size of the figure (X,X)
    """
    counts = dataframe[col].value_counts()
    plt.figure(figsize=size)
    plt.title("Représentation de la variable {}" .format(
        col), fontsize=20)
    plt.pie(counts, labels=counts.index,
            autopct='%.1f%%', shadow=True, textprops={'fontsize': 20})
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
    print("\n \n \n")
# %%flake8
def hist_col_categories(dataframe, col, size):
    """Draw one categorical variable of the dataset as a histogram,
    ordered by decreasing frequency.

    dataframe : dataset
    col : column to represent
    size : size of the figure (X,X)
    """
    plt.figure(figsize=size)
    axis = sns.countplot(x=dataframe[col],
                         order=dataframe[col].value_counts().index)
    plt.title("Representation of the variable {}" .format(col), fontsize=24)
    plt.rc('xtick', labelsize=14)
    axis.set_xticklabels(axis.get_xticklabels(), rotation=70, ha="right")
    plt.tight_layout()
    plt.show()
    print("\n \n \n")
# %%flake8
def graphe_objet(dataframe):
    """Draw, for every 'object' column of the dataset, a WordCloud and a
    histogram of the 20 most common occurrences.

    dataframe : dataset
    """
    from wordcloud import WordCloud, STOPWORDS
    object_df = dataframe.select_dtypes(include=['object'])
    for col in list(object_df):
        counts = dataframe[col].value_counts()
        # WordCloud needs string keys.
        counts.index = counts.index.map(str)
        wordcloud = WordCloud(
            background_color='white',
            stopwords=STOPWORDS,
            max_words=200,
            max_font_size=40,
            scale=3,
            random_state=5  # fixed seed for a reproducible layout
        ).generate_from_frequencies(counts)
        fig = plt.figure(1, figsize=(12, 12))
        plt.axis('off')
        fig.suptitle("WordCloud of the variable {}" .format(col), fontsize=20)
        fig.subplots_adjust(top=2.3)
        plt.imshow(wordcloud)
        plt.show()
        print("\n \n")
        # Horizontal bar plot of the 20 most frequent values
        count = dataframe[col].value_counts()
        # BUG FIX: ``count[:20,]`` tuple-indexed the Series (deprecated,
        # removed in recent pandas); use positional slicing instead.
        count = count.iloc[:20]
        plt.figure(figsize=(15, 10))
        sns.set(style="whitegrid")
        # BUG FIX: positional data arguments were removed from
        # seaborn.barplot (0.12+); pass them as keywords.
        sns.barplot(x=count.values, y=count.index, alpha=0.8, orient='h')
        plt.title("Top 20 number of occurrences for the variable {}" .format(col))
        plt.xlabel('Number of Occurrences', fontsize=12)
        plt.show()
        print("\n \n \n \n \n")
# %%flake8
def plot_date_dist(df, feature_w_date):
    """Plot the monthly distribution of one date feature.

    df : dataframe
    feature_w_date : Date column to plot
    """
    # Bucket the dates by month, then count the rows per month.
    periods = pd.DataFrame(pd.DatetimeIndex(df[feature_w_date]).to_period('M'))
    counts = periods[feature_w_date].value_counts()
    monthly = pd.DataFrame({
        feature_w_date: counts.index,
        'Quantity': counts.values,
    }).sort_values(by=feature_w_date)
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.set_color_codes("pastel")
    sns.barplot(ax=ax, x=feature_w_date, y='Quantity', data=monthly)
    plt.xticks(rotation=45)
    plt.show()
# %%flake8
def plot_hist_stats(df, feature, title=None, verticale=False):
    """Display the histogram of a characteristic.

    df : dataframe
    feature : column to plot
    title : title of the plot (omitted when None)
    verticale : when True, rotate the xticks vertically
    """
    tmp = df[feature].value_counts()
    df_tmp = pd.DataFrame({
        feature: tmp.index,
        'Quantity': tmp.values
    })
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.set_color_codes("pastel")
    sns.barplot(ax=ax, x=feature, y='Quantity', data=df_tmp)
    # Idiom fixes: ``is not None`` instead of ``!= None`` and direct
    # truthiness instead of ``== True``; behavior is unchanged.
    if title is not None:
        plt.title(title)
    if verticale:
        plt.xticks(rotation=90)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()
# %%flake8
def filtration_columns(dataframe, filling_rate):
    """Drop every column whose filling rate is below ``filling_rate``.

    dataframe : Dataframe to be filtered
    filling_rate : minimum fraction of non-missing values a column must
    reach to be kept
    """
    # Literal 'nan' strings count as missing values.
    dataframe = dataframe.replace(to_replace='^nan$',
                                  value=np.nan, regex=True)
    initial_columns = dataframe.shape[1]
    for col_name in dataframe.columns.tolist():
        # Fill rate = (rows - NaNs in the column) / rows
        n_rows = dataframe[col_name].shape[0]
        filled = (n_rows - dataframe[col_name].isna().sum()) / n_rows
        if filled < filling_rate:
            dataframe.drop(col_name, axis=1, inplace=True)
    removed = initial_columns - dataframe.shape[1]
    if removed == 0:
        print("All the columns have a filling rate higher than {:.2%}"
              .format(filling_rate))
    else:
        print("Number of columns with a fill rate greater than {:.2%}: "
              "{} columns."
              .format(filling_rate, dataframe.shape[1]))
        print("Number of deleted columns: {} columns".format(removed))
    print("\n")
    return dataframe
# %%flake8
def scatter_plot(dataframe_point, colX, colY, dataframe_reg, hue=None):
    """Scatter plot of two variables with a linear regression line.

    dataframe_point : dataframe to make plot
    colX : variable plotting on x axis
    colY : variable plotting on y axis
    dataframe_reg : dataframe to make regression
    hue : plot color by a specific variable by default None
    """
    pearson = np.corrcoef(dataframe_point[colX], dataframe_point[colY])[0][1]
    print("The Pearson coefficient is {:.2f}".format(pearson))
    sns.set(style="darkgrid", color_codes=True)
    sns.set(rc={'figure.figsize': (15, 10)})
    # Regression statistics computed on the regression dataframe
    r, p = stats.pearsonr(dataframe_reg[colX], dataframe_reg[colY])
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        dataframe_reg[colX], dataframe_reg[colY])
    axis = sns.scatterplot(data=dataframe_point, x=colX, y=colY, hue=hue)
    reg = sns.regplot(data=dataframe_reg, x=colX, y=colY, scatter=False,
                      ci=68, ax=axis, line_kws={'color': 'red'})
    # Annotate the plot with the regression equation and R².
    reg.annotate(f'y={slope:.2f}x+{intercept:.1f}\n R2={r*r:.4f}',
                 xy=(0.1, 0.9), xycoords='axes fraction',
                 ha='left', va='center',
                 color='black',
                 fontweight="bold",
                 bbox={'boxstyle': 'round', 'fc': 'pink', 'ec': 'purple'})
    plt.title("Relationship between the variable {} and the variable {}".format(colX, colY),
              fontsize=18, fontweight="bold")
    plt.tight_layout()
    plt.show()
# %%flake8
# Mount Google Drive (Colab) and move to the project data folder.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My\ Drive/Data_projet_OC
!ls
# Load the nine Olist e-commerce CSV files from the Drive folder.
customers = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                        'olist_customers_dataset.csv')
geolocalisation = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                              'olist_geolocation_dataset.csv')
order_items = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                          'olist_order_items_dataset.csv')
order_payments = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                             'olist_order_payments_dataset.csv')
order_reviews = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                            'olist_order_reviews_dataset.csv')
orders = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                     'olist_orders_dataset.csv')
products = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                       'olist_products_dataset.csv')
sellers = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                      'olist_sellers_dataset.csv')
product_category = pd.read_csv('/content/drive/MyDrive/Data_projet_OC/'
                               'product_category_name_translation.csv')
"""# %%flake8
customers = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_customers_dataset.csv")
geolocalisation = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_geolocation_dataset.csv")
order_items = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_order_items_dataset.csv")
order_payments = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_order_payments_dataset.csv")
order_reviews = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_order_reviews_dataset.csv")
orders = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_orders_dataset.csv")
products = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_products_dataset.csv")
sellers = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"olist_sellers_dataset.csv")
product_category = pd.read_csv("/Users/amandinelecerfdefer/Desktop/"
"Formation_Data_Scientist_OC/WORK-projet5/Data/"
"product_category_name_translation.csv")"""
# %%flake8
# The nine datasets and their display names, index-aligned.
datasets = [customers, geolocalisation, order_items, order_payments,
            order_reviews, orders, products, sellers, product_category]
# %%flake8
list_names = ['customers', 'geolocalisation', 'order_items', 'order_payments',
              'order_reviews', 'orders', 'products', 'sellers',
              'product_category']
# %%flake8
# General overview (head, columns, dtypes, fill-rate pie) of every
# dataset, plus a summary table of their dimensions.
resume = []
for name in list_names:
    pos = list_names.index(name)
    dataset = datasets[pos]
    print("General presentation of the dataset {}." .format(name))
    informations(dataset)
    pie_NaN(dataset, (10, 10))
    resume.append(
        {
            'Dataset': name,
            'Lines': dataset.shape[0],
            'Columns': dataset.shape[1]
        }
    )
pd.DataFrame(resume)
The Customers dataset gathers information about a customer. It contains his identifier, a unique identifier, the zip code of the place where the buyer lives, his city and his country.
customer_id : key to the orders dataset. Each order has a unique customer_id.
customer_unique_id: unique identifier of a customer.
customer_zip_code_prefix: first five digits of customer zip code
customer_city customer city name
customer_state customer state
The Geolocalisation dataset allows the correspondence between customers custom_state, zip code and city name and sellers. It can be linked to customers by the link that exists between the two datasets and especially the columns customer_zip_code_prefix and geolocation_zip_code_prefix.
geolocation_zip_code_prefix : first 5 digits of zip code
geolocation_lat : latitude
geolocation_lng : longitude
geolocation_city : city name
geolocation_state : state
The orders item dataset gives information on the different commands (id order, number of product, id of product, seller id, limit shipping date, total price and expedition tax).
order_id : order unique identifier
order_item_id : sequential number identifying number of items included in the same order.
product_id : product unique identifier
seller_id : seller unique identifier
shipping_limit_date : Shows the seller shipping limit date for handling the order over to the logistic partner.
price : item price
freight_value : item freight value (if an order has more than one item the freight value is split between items)
Order Payment : Information about the payment the customer has made for an order (id, number of part for payment, payment type, value).
order_id : unique identifier of an order.
payment_sequential : a customer may pay an order with more than one payment method. If he does so, a sequence will be created to accommodate all payments.
payment_type : method of payment chosen by the customer.
payment_installments : number of installments chosen by the customer.
payment_value : transaction value.
Order review : This dataset contains the evaluations that each customer makes per order (id, order id, score out of 5, title, comment, creation date, answer date).
review_id : unique review identifier
order_id unique order identifier
review_score : Note ranging from 1 to 5 given by the customer on a satisfaction survey.
review_comment_title : Comment title from the review left by the customer, in Portuguese.
review_comment_message : Comment message from the review left by the customer, in Portuguese.
review_creation_date : Shows the date in which the satisfaction survey was sent to the customer.
review_answer_timestamp : Shows satisfaction survey answer timestamp
The Order dataset contains information about the orders (id, customer id, status, purchase date, approval date, delivered date (carrier & customers), delivery date). It can be linked to customers by the link that exists between the two datasets and especially the customers_id and it can be linked to order_review, order_payment and order_item by the link that exists between datasets and especially order_id.
order_id : unique identifier of the order.
customer_id : key to the customer dataset. Each order has a unique customer_id.
order_status : Reference to the order status (delivered, shipped, etc).
order_purchase_timestamp : Shows the purchase timestamp.
order_approved_at : Shows the payment approval timestamp.
order_delivered_carrier_date : Shows the order posting timestamp. When it was handled to the logistic partner.
order_delivered_customer_date : Shows the actual order delivery date to the customer.
order_estimated_delivery_date : Shows the estimated delivery date that was informed to customer at the purchase moment.
The product dataset contains information about the product (id, category, length of product name and description, number of photos and dimensions of the product). It can be linked to order_item by the product id.
product_id : unique product identifier
product_category_name : root category of product, in Portuguese.
product_name_lenght : number of characters extracted from the product name.
product_description_lenght : number of characters extracted from the product description.
product_photos_qty : number of product published photos
product_weight_g : product weight measured in grams.
product_length_cm : product length measured in centimeters.
product_height_cm : product height measured in centimeters.
product_width_cm : product width measured in centimeters.
The seller dataset contains information about the sellers (id, zip code, city and state). It can be linked to order_item by the seller id and with geolocalisation by zip code prefix.
seller_id : seller unique identifier
seller_zip_code_prefix : first 5 digits of seller zip code
seller_city : seller city name
seller_state : seller state
The product category table allows the conversion of product names into English.
product_category_name : category name in Portuguese
product_category_name_english : category name in English
# Shape of every dataset.
for name in list_names:
    pos = list_names.index(name)
    dataset = datasets[pos]
    print("Shape of the dataset {}." .format(name))
    print(dataset.shape)
    print("\n")
# Share of missing cells in every dataset
# (non-missing cells over rows * columns).
for name in list_names:
    pos = list_names.index(name)
    dataset = datasets[pos]
    lines = dataset.shape[0]
    columns = dataset.shape[1]
    nb_data = dataset.count().sum()
    nb_totale = (columns*lines)
    rate_dataOK = (nb_data/nb_totale)
    print("NAN of the dataset {}." .format(name))
    print("There is {:.2%} of missing data".format(1-rate_dataOK))
    print("\n")
There are missing data for :
order_reviews.isna().sum(axis=0)
Missing data correspond to empty titles and messages.
orders.isna().sum(axis=0)
For orders, the missing data is increasing in the order of an order process and corresponds to the problem orders.
orders.max()
products.isna().sum(axis=0)
For products, there is missing data for:
products[products['product_category_name'].isna()].sample(5)
We have products for which important information is missing (especially the product category).
products[products['product_weight_g'].isna()]
for name in list_names:
pos = list_names.index(name)
dataset = datasets[pos]
print("Duplicate of the dataset {}." .format(name))
print(dataset.duplicated().sum())
print("\n")
There are duplicates only for the geolocation dataset.
list_df = []
list_column = []
list_type = []
list_uniq_val = []
for name in list_names:
pos = list_names.index(name)
dataset = datasets[pos]
for column in dataset.columns:
list_df.append(name)
list_column.append(column)
list_type.append(dataset[column].dtype)
list_uniq_val.append(dataset[column].nunique())
resume_data = pd.DataFrame([list_df,
list_column,
list_type,
list_uniq_val]
).T
resume_data.columns=['dataset', 'column', 'Type', 'list_uniq_val']
resume_data
customers :
there are more unique values for customer_id than for customer_unique_id (assumption: customer_unique_id corresponds to the number of customers and customer_id corresponds to the customer id for each order. Thus, customers could make several orders with the same unique id but with different customer_id for different orders).
The customers zip_code should not be an int but an object.
geolocation
order_items
order_payments :
order_review :
orders :
sellers:
product_category :
# %%flake8
geolocalisation.duplicated().sum()
# %%flake8
geolocalisation.loc[geolocalisation['geolocation_zip_code_prefix'] == 1046]
We can see that for the same postal code, there are several locations with different latitudes and longitudes. To homogenize the latitudes and longitudes for each city, we can take for each city the average of the latitudes and longitudes.
# %%flake8
# One zip-code prefix appears with several slightly different
# coordinates: average the latitude per prefix.
geolocalisation['geolocation_lat_means'] = geolocalisation\
    .groupby(by='geolocation_zip_code_prefix')['geolocation_lat']\
    .transform('mean')
# %%flake8
# Same for the longitude.
geolocalisation['geolocation_lng_means'] = geolocalisation\
    .groupby(by='geolocation_zip_code_prefix')['geolocation_lng']\
    .transform('mean')
# %%flake8
geolocalisation
# %%flake8
# Keep a single row per zip-code prefix now that the coordinates are
# averaged.
duplicate(geolocalisation, 'geolocation_zip_code_prefix')
There were 981 148 duplicate elements.
There are missing data in review comment title and message. It is therefore necessary to charge them.
# %%flake8
order_reviews['review_comment_title'].fillna(' ', inplace=True)
order_reviews['review_comment_message'].fillna(' ', inplace=True)
There are missing data in approval date, delivered carrier date and delivered customer date. It is therefore necessary to charge them.
# %%flake8
# NOTE(review): missing order timestamps are filled with the integer 0;
# these columns are cast to datetime further down (0 -> 1970 epoch) —
# confirm this placeholder is intended.
orders['order_approved_at'].fillna(0, inplace=True)
orders['order_delivered_carrier_date'].fillna(0, inplace=True)
orders['order_delivered_customer_date'].fillna(0, inplace=True)
There is missing data for this dataset in category name, lenght name and description, number of photos and product's dimensions.
# %%flake8
# Missing product descriptors: a placeholder label for the category and
# zero for the numeric measures.
products['product_category_name'].fillna('Unkwown', inplace=True)
products['product_name_lenght'].fillna(0, inplace=True)
products['product_description_lenght'].fillna(0, inplace=True)
products['product_photos_qty'].fillna(0, inplace=True)
products['product_weight_g'].fillna(0, inplace=True)
products['product_length_cm'].fillna(0, inplace=True)
products['product_height_cm'].fillna(0, inplace=True)
products['product_width_cm'].fillna(0, inplace=True)
# %%flake8
order_payments["payment_installments"].unique()
#%%flake8
# Work on a copy so the raw payment data stays untouched.
visualization_payment = order_payments.copy()
# %%flake8
# NOTE(review): this mask is computed on ``order_payments`` but applied
# to the copy — identical values at this point, yet fragile; confirm.
visualization_payment.loc[(order_payments["payment_installments"] == 0), "payment_installments"] = 1
# %%flake8
# Bucket the installment counts: 6-10 -> sentinel 1000, 11-49 -> 2000.
visualization_payment.loc[(visualization_payment["payment_installments"] > 5) & (
    visualization_payment["payment_installments"] <= 10),
    "payment_installments"] = 1000
# %%flake8
visualization_payment.loc[(visualization_payment["payment_installments"] > 10) & (
    visualization_payment["payment_installments"] < 50),
    "payment_installments"] = 2000
# %%flake8
visualization_payment["payment_installments"].unique()
# %%flake8
# Replace the numeric sentinels with readable labels.
visualization_payment['payment_installments'] = visualization_payment[
    'payment_installments'].replace(
    [1000, 2000], ['between_5&10', 'over_10'])
# %%flake8
visualization_payment["payment_installments"].unique()
# %%flake8
visualization_payment['payment_sequential'].value_counts().sort_index(ascending=True)
# %%flake8
visualization_payment["payment_sequential"].unique()
# %%flake8
# Collapse every payment_sequential above 3 into the sentinel 45,
# relabeled 'over_3' below.
visualization_payment.loc[(order_payments["payment_sequential"] > 3),
                          "payment_sequential"] = 45
# %%flake8
visualization_payment["payment_sequential"].unique()
# %%flake8
visualization_payment['payment_sequential'] = visualization_payment[
    'payment_sequential'].replace(
    [45], ['over_3'])
# %%flake8
visualization_payment["payment_sequential"].unique()
# %%flake8
# Cast the raw columns to their proper dtypes.
# FIX: unit-less astype('datetime64') is rejected by recent pandas;
# spell the nanosecond unit explicitly ('datetime64[ns]'), which is the
# exact dtype the old spelling produced.
orders['order_purchase_timestamp'] = orders[
    'order_purchase_timestamp'].astype('datetime64[ns]')
orders['order_approved_at'] = orders[
    'order_approved_at'].astype('datetime64[ns]')
orders['order_delivered_carrier_date'] = orders[
    'order_delivered_carrier_date'].astype('datetime64[ns]')
orders['order_delivered_customer_date'] = orders[
    'order_delivered_customer_date'].astype('datetime64[ns]')
orders['order_estimated_delivery_date'] = orders[
    'order_estimated_delivery_date'].astype('datetime64[ns]')
order_reviews['review_creation_date'] = order_reviews[
    'review_creation_date'].astype('datetime64[ns]')
order_reviews['review_answer_timestamp'] = order_reviews[
    'review_answer_timestamp'].astype('datetime64[ns]')
order_items['shipping_limit_date'] = order_items[
    'shipping_limit_date'].astype('datetime64[ns]')
order_payments['payment_value'] = order_payments[
    'payment_value'].astype('float64')
# Zip-code prefixes and item ids are identifiers, not quantities.
customers['customer_zip_code_prefix'] = customers[
    'customer_zip_code_prefix'].astype(
    'object')
sellers['seller_zip_code_prefix'] = sellers['seller_zip_code_prefix'].astype(
    'object')
# NOTE(review): the two casts below repeat the two casts above — no-ops.
customers['customer_zip_code_prefix'] = customers[
    'customer_zip_code_prefix'].astype('object')
sellers['seller_zip_code_prefix'] = sellers[
    'seller_zip_code_prefix'].astype('object')
order_items['order_item_id'] = order_items['order_item_id'].astype('object')
geolocalisation["geolocation_zip_code_prefix"] = geolocalisation[
    "geolocation_zip_code_prefix"].astype('object')
order_reviews['review_score'] = order_reviews[
    'review_score'].astype('int64')
# %%flake8
# Drop reviews whose score is outside the valid 1-5 range, then treat
# the score as a categorical (object) column.
# NOTE(review): Index.append is deprecated in recent pandas
# (Index.union is the replacement) — confirm the pinned version.
indexNames = order_reviews[order_reviews['review_score'] < 1].index
indexNames1 = order_reviews[order_reviews['review_score'] > 5].index
indexNames = indexNames.append(indexNames1)
order_reviews.drop(indexNames, inplace=True)
order_reviews['review_score'] = order_reviews['review_score'].astype('object')
# Boxplots of every numeric column of the three datasets holding
# numerical measures.
datasets = [order_items, order_payments, products]
list_names = ['order_items', 'order_payments',
              'products']
for name in list_names:
    pos = list_names.index(name)
    dataset = datasets[pos]
    print(colored("\n Boxplot of the dataset {}: \n".format(name), 'red'))
    # BUG FIX: the numeric columns were read from an undefined ``data``
    # variable instead of the current ``dataset``.
    columns = dataset.select_dtypes(include='number').columns
    for col in columns:
        print(colored("\n Boxplot of the columns {}: \n".format(col), 'green'))
        plt.boxplot(dataset[col])
        plt.show()
"""I decide not to remove any more outliers so as not to exclude potential large orders.
for name in list_names:
pos = list_names.index(name)
dataset = datasets[pos]
print("Verification of the normality of the numerical columns of the dataset: {}." .format(name))
verif_normalite(dataset)
print("\n")
print("Verification of the normality of the numerical"
"columns of the dataset: {}." .format("order_items"))
print(plot_multiple_histograms(order_items, (15, 5)))
print("Verification of the normality of the numerical"
"columns of the dataset: {}." .format("order_payments"))
print(plot_multiple_histograms(order_payments, (15, 5)))
print("Verification of the normality of the numerical"
"columns of the dataset: {}." .format("products"))
print(plot_multiple_histograms(products, (15, 15)))
The variables don't have a Normal distribution so the Outliers are removed by the Percentile method.
outliers_datasets = [order_items, order_payments, products]
outliers_list_names = ['order_items', 'order_payments', 'products']
for name in outliers_list_names:
pos = outliers_list_names.index(name)
dataset = outliers_datasets[pos]
print("Delete outliers: {}." .format(name))
delete_outliers_percentiles(dataset)
print("OK")
print("\n")
print("Histogramme of the numerical"
"columns of the dataset: {}." .format("order_items"))
print(plot_multiple_histograms(order_items, (15, 5)))
print("Histogramme of the numerical"
"columns of the dataset: {}." .format("order_payments"))
print(plot_multiple_histograms(order_payments, (15, 5)))
print("Histogramme of the numerical"
"columns of the dataset: {}." .format("products"))
print(plot_multiple_histograms(products, (15, 15)))"""
# Mapping: raw product category name -> broader hand-made category group
# (PEP 8 formatting: no space before ':' in dict literals).
dict_categories = {
    # hygiene_health_wellness
    'health_beauty': 'hygiene_health_wellness',
    'baby': 'hygiene_health_wellness',
    'diapers_and_hygiene': 'hygiene_health_wellness',
    'perfumery': 'hygiene_health_wellness',
    # furniture_decoration_home
    'furniture_living_room': 'furniture_decoration_home',
    'furniture_mattress_and_upholstery': 'furniture_decoration_home',
    'furniture_bedroom': 'furniture_decoration_home',
    'furniture_decor': 'furniture_decoration_home',
    'bed_bath_table': 'furniture_decoration_home',
    'kitchen_dining_laundry_garden_furniture': 'furniture_decoration_home',
    'la_cuisine': 'furniture_decoration_home',
    'home_confort': 'furniture_decoration_home',
    'home_comfort_2': 'furniture_decoration_home',
    'christmas_supplies': 'furniture_decoration_home',
    # repairs_constructions
    'construction_tools_construction': 'repairs_constructions',
    'costruction_tools_garden': 'repairs_constructions',
    'costruction_tools_tools': 'repairs_constructions',
    'construction_tools_safety': 'repairs_constructions',
    'construction_tools_lights': 'repairs_constructions',
    'home_construction': 'repairs_constructions',
    'air_conditioning': 'repairs_constructions',
    'auto': 'repairs_constructions',
    # Office_equipment_furniture
    'office_furniture': 'Office_equipment_furniture',
    'industry_commerce_and_business': 'Office_equipment_furniture',
    'stationery': 'Office_equipment_furniture',
    'agro_industry_and_commerce': 'Office_equipment_furniture',
    'signaling_and_security': 'Office_equipment_furniture',
    'furnitures': 'Office_equipment_furniture',
    'security_and_services': 'Office_equipment_furniture',
    # food_drink
    'food_drink': 'food_drink',
    'food': 'food_drink',
    'drinks': 'food_drink',
    # electronic_equipment
    'telephony': 'electronic_equipment',
    'electronics': 'electronic_equipment',
    'computers_accessories': 'electronic_equipment',
    'consoles_games': 'electronic_equipment',
    'fixed_telephony': 'electronic_equipment',
    'audio': 'electronic_equipment',
    'computers': 'electronic_equipment',
    'tablets_printing_image': 'electronic_equipment',
    # home_appliances
    'small_appliances': 'home_appliances',
    'small_appliances_home_oven_and_coffee': 'home_appliances',
    'home_appliances_2': 'home_appliances',
    'home_appliances': 'home_appliances',
    'housewares': 'home_appliances',
    # clothing_accessories
    'watches_gifts': 'clothing_accessories',
    'fashion_bags_accessories': 'clothing_accessories',
    'fashion_underwear_beach': 'clothing_accessories',
    'fashion_shoes': 'clothing_accessories',
    'fashion_male_clothing': 'clothing_accessories',
    'fashio_female_clothing': 'clothing_accessories',
    'fashion_sport': 'clothing_accessories',
    'fashion_childrens_clothes': 'clothing_accessories',
    'luggage_accessories': 'clothing_accessories',
    # garden and pets
    'flowers': 'garden_pets',
    'pet_shop': 'garden_pets',
    'garden_tools': 'garden_pets',
    # leisure_homemade
    'sports_leisure': 'leisure_homemade',
    'musical_instruments': 'leisure_homemade',
    'party_supplies': 'leisure_homemade',
    'books': 'leisure_homemade',
    'books_imported': 'leisure_homemade',
    'books_general_interest': 'leisure_homemade',
    'books_technical': 'leisure_homemade',
    'art': 'leisure_homemade',
    'toys': 'leisure_homemade',
    'cine_photo': 'leisure_homemade',
    'cds_dvds_musicals': 'leisure_homemade',
    'music': 'leisure_homemade',
    'dvds_blu_ray': 'leisure_homemade',
    'arts_and_craftmanship': 'leisure_homemade',
    # other
    'Unknown': 'other',
    'market_place': 'other',
    'cool_stuff': 'other',
}
# %%flake8
# Map each English category name to its broader group from
# `dict_categories` (names absent from the mapping become NaN).
product_category['product_category_general'] = product_category[
    'product_category_name_english'].map(dict_categories)
# %%flake8
# Product volume in cm^3 (length x height x width); NaN when any
# dimension is missing.
products['volume_product'] = products['product_length_cm']*products[
    'product_height_cm']*products['product_width_cm']
# Describe every dataset.  `zip` pairs each name with its dataframe
# directly; the original `list_names.index(name)` lookup was O(n) per
# iteration and always returned the *first* dataset when two names were
# equal.
for name, dataset in zip(list_names, datasets):
    print("Analyze of the dataset : {}.".format(name))
    print(description_var(dataset))
    print("\n")
# %%flake8
customers['customer_unique_id'].nunique()
There are 96096 different customers.
# %%flake8
customers['customer_id'].nunique()
There are more consumer identifiers than unique consumer identifiers: this makes it possible to hypothesize that some consumers have made more than one order. To be checked with order.
# %%flake8
customers['customer_zip_code_prefix'].nunique()
Consumers come from 14 994 different zip codes.
# %%flake8
# Count customers per state and plot the states by descending count.
customer_by_state = (
    customers[['customer_unique_id', 'customer_state']]
    .groupby('customer_state')
    .count()
    .reset_index()
    .sort_values(by=['customer_unique_id'], ascending=False)
)
plt.figure(figsize=(15, 10))
sns.barplot(x='customer_state', y='customer_unique_id',
            data=customer_by_state)
plt.title("Distribution of customers by state")
plt.show()
With this graph, we can see that most customers are located in SP, RJ and MG: SP is São Paulo, RJ is Rio de Janeiro, and MG is Minas Gerais (whose capital is Belo Horizonte).
# %%flake8
def region_feature(dataframe=None, state_col='customer_state',
                   region_col='customer_region'):
    """Add a Brazilian macro-region column derived from a state code.

    Generalized (backward-compatible) version: calling it with no
    argument keeps the original behavior of annotating the global
    ``customers`` table in place.

    Parameters
    ----------
    dataframe : DataFrame or None
        Table to annotate; defaults to the global ``customers``.
    state_col : str
        Column holding two-letter state codes (e.g. 'SP').
    region_col : str
        Name of the categorical column to create.

    Returns the annotated dataframe (also modified in place).
    Unknown state codes map to NaN.
    """
    if dataframe is None:
        dataframe = customers
    regions_dict = {
        "North": {'AC', 'AM', 'RR', 'RO', 'PA', 'AP', 'TO'},
        "Northeast": {'MA', 'PI', 'BA', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE'},
        "Central": {'MT', 'GO', 'MS', 'DF'},
        "Southeast": {'MG', 'SP', 'ES', 'RJ'},
        "South": {'PR', 'SC', 'RS'},
    }

    def get_region(state):
        # Returns None (NaN after .map) for unknown codes.
        for region, set_states in regions_dict.items():
            if state in set_states:
                return region

    dataframe[region_col] = dataframe[state_col].map(get_region)
    dataframe[region_col] = dataframe[region_col].astype('category')
    return dataframe
# Build the region feature on the customers table and inspect the result.
region_feature()
customers['customer_region']
# Pie chart of the share of customers per region (project helper).
pie_col_category(customers, 'customer_region', (10,10))
# %%flake8
geolocalisation['geolocation_state'].unique()
# %%flake8
geolocalisation['geolocation_state'].nunique()
There are a total of 27 states represented in this dataset.
# %%flake8
geolocalisation['geolocation_city'].unique()
# %%flake8
# Normalise the accented spelling so that 'são paulo' and 'sao paulo'
# are counted as the same city.
geolocalisation['geolocation_city'] = geolocalisation[
    'geolocation_city'].replace({'são paulo': 'sao paulo'})
# %%flake8
geolocalisation['geolocation_city'].nunique()
There are a total of 5954 cities represented in this dataset.
# %%flake8
geolocalisation.loc[geolocalisation['geolocation_state'] == 'SP']
# %%flake8
geolocalisation.loc[geolocalisation['geolocation_state'] == 'RJ']
# %%flake8
geolocalisation.loc[geolocalisation['geolocation_state'] == 'MG']
# %%flake8
# Third-party geo imports done here notebook-style; PEP 8 would place
# them at the top of the file.
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
# Build one Point per row from (lng, lat) — note the (x=lng, y=lat) order
# expected by shapely.
geometry = [Point(xy) for xy in zip(geolocalisation['geolocation_lng'],
                                    geolocalisation['geolocation_lat'])]
gdf = GeoDataFrame(geolocalisation, geometry=geometry)
# this is a simple map that goes with geopandas
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Scatter every geolocation point (red) over the world basemap.
gdf.plot(ax=world.plot(figsize=(40, 30)), marker='o',
         color='red', markersize=15)
With this map, we can say that almost all sellers and buyers are in Brazil and some of these people are in Portugal, Mexico that is why the shipping tax is different.
# %%flake8
geolocalisation = geolocalisation.drop(columns=['geometry'])
# %%flake8
plot_date_dist(order_items, 'shipping_limit_date')
Here we see that most of the products are no longer marketed at the end of 2017 and into 2018. Some products are available until early 2020.
# %%flake8
plot_hist_stats(order_items, 'order_item_id')
Most orders are for 1 item.
# %%flake8
order_payments['payment_value'].describe()
The average customer basket is 154 euros
# %%flake8
pie_col_category(visualization_payment, 'payment_installments', (15, 15))
# %%flake8
pie_col_category(visualization_payment, 'payment_sequential', (15, 15))
# %%flake8
hist_col_categories(order_payments, 'payment_type', (10, 10))
# %%flake8
order_reviews['review_score'] = order_reviews[
'review_score'].astype('category')
graphe_categories(order_reviews, (15, 10))
# %%flake8
order_reviews['review_score'] = order_reviews[
'review_score'].astype('int64')
# %%flake8
order_reviews2 = order_reviews.drop(['review_id', 'order_id',
'review_creation_date',
'review_answer_timestamp'], axis=1)
# %%flake8
graphe_objet(order_reviews2)
Here we can see that the vast majority of orders are positive.
# %%flake8
plot_date_dist(order_reviews, 'review_creation_date')
Here, we can see that the majority of comments were given in 2018.
# %%flake8
plot_date_dist(order_reviews, 'review_answer_timestamp')
# %%flake8
orders.order_id.nunique()
There were a total of 99,441 orders in 2 years.
# %%flake8
nb_customer = customers['customer_unique_id'].nunique()
nb_orders = orders.order_id.nunique() # nombre de commande
mean_order = nb_orders/nb_customer
mean_order
Here, we can say that some buyers make more than one order on the site.
# %%flake8
hist_col_categories(orders, 'order_status', (10, 10))
# %%flake8
orders['order_purchase_timestamp'].max() # .MAXYEAR
# %%flake8
orders['order_purchase_timestamp'].min() # .MINYEAR
We have the data for orders that span from 2016 to 2018 which is 2 years.
plot_date_dist(orders, 'order_purchase_timestamp')
# %%flake8
products2 = products.drop(['product_id'], axis=1)
# %%flake8
graphe_objet(products2)
"""# %%flake8
products['product_category_name'] = products[
'product_category_name'].astype('category')"""
# %%flake8
sellers['seller_id'].nunique()
There are 3095 different sellers.
# %%flake8
sellers['seller_zip_code_prefix'].nunique()
Sellers come from 2,246 different zip codes.
# %%flake8
sellers_by_state = sellers[['seller_id', 'seller_state']]\
.groupby('seller_state').count().reset_index()
sellers_by_state = sellers_by_state.sort_values(by=['seller_id'],
ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(x='seller_state',
y='seller_id',
data=sellers_by_state)
plt.title("Distribution of sellers by state")
plt.show()
With this graph, we can see that most sellers are located in SP, PR and MG: SP is São Paulo, PR is Paraná (capital Curitiba), and MG is Minas Gerais (capital Belo Horizonte).
# %%flake8
geolocalisation.loc[geolocalisation['geolocation_state'] == 'PR']
# %%flake8
def region_feature(dataframe=None, state_col='seller_state',
                   region_col='seller_region'):
    """Add a Brazilian macro-region column derived from a state code.

    Seller-side variant (the notebook redefines ``region_feature`` here,
    as the original did).  Generalized, backward-compatible: calling it
    with no argument annotates the global ``sellers`` table in place.

    Parameters
    ----------
    dataframe : DataFrame or None
        Table to annotate; defaults to the global ``sellers``.
    state_col : str
        Column holding two-letter state codes (e.g. 'PR').
    region_col : str
        Name of the categorical column to create.

    Returns the annotated dataframe (also modified in place).
    Unknown state codes map to NaN.
    """
    if dataframe is None:
        dataframe = sellers
    regions_dict = {
        "North": {'AC', 'AM', 'RR', 'RO', 'PA', 'AP', 'TO'},
        "Northeast": {'MA', 'PI', 'BA', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE'},
        "Central": {'MT', 'GO', 'MS', 'DF'},
        "Southeast": {'MG', 'SP', 'ES', 'RJ'},
        "South": {'PR', 'SC', 'RS'},
    }

    def get_region(state):
        # Returns None (NaN after .map) for unknown codes.
        for region, set_states in regions_dict.items():
            if state in set_states:
                return region

    dataframe[region_col] = dataframe[state_col].map(get_region)
    dataframe[region_col] = dataframe[region_col].astype('category')
    return dataframe
region_feature()
sellers[sellers['seller_region'].isna()]
pie_col_category(sellers, 'seller_region', (10,10))
Creation of a dataframe to assign to each product its category in English form.
# %%flake8
# Join products to their English category names, then plot the ten
# categories that contain the most products.
produc_cat = pd.merge(products, product_category).drop(
    ['product_category_name'], axis=1)
product_by_category = (
    produc_cat[['product_id', 'product_category_name_english']]
    .groupby('product_category_name_english')
    .count()
    .reset_index()
    .sort_values(by=['product_id'], ascending=False)
    .head(10)
)
plt.figure(figsize=(14, 10))
sns.barplot(x='product_category_name_english', y='product_id',
            data=product_by_category)
plt.xticks(rotation=80, fontsize=18)
plt.tight_layout()
plt.title("Top 10 of distribution of product", fontsize=22)
plt.xlabel(" ")
plt.ylabel(" ")
plt.show()
Distribution of the products present on the site according to their main categories.
product_category['product_category_general'].value_counts().plot.pie(figsize=(20, 20),
autopct='%.2f%%',
textprops={'fontsize': 14})
ax1 = plt.axes()
y_axis = ax1.axes.get_yaxis()
y_axis.set_visible(False)
plt.show()
plt.close()
Now that we know more about the nine CSVs that make up the database, we can say that they are all linked together by the following relationship scheme:
# %%flake8
left_df = products
right_df = product_category
products = pd.merge(left_df, right_df, on='product_category_name', how='left')
#products = products.drop(
# ['product_category_name'], axis=1)
products
# %%flake8
# Attach geolocation info to each seller via the zip-code prefix.
left_df = sellers
right_df = geolocalisation
sellers = pd.merge(left_df, right_df, left_on='seller_zip_code_prefix',
                   right_on='geolocation_zip_code_prefix', how='left')
sellers
# %%flake8
# Rename columns of geolocalisation to be more explicit
# NOTE(review): these rename keys assume geolocalisation carries
# aggregated '*_means' lat/lng columns built earlier in the notebook;
# if the columns are plain 'geolocation_lat'/'geolocation_lng' (as used
# for the world map above), this rename is a silent no-op — TODO confirm
# the upstream aggregation step.
sellers = sellers.rename(columns={
    "geolocation_lat_means": "seller_lat",
    "geolocation_lng_means": "seller_lng",
})
# Drop duplicates informations
sellers = sellers.drop(columns=[
    'geolocation_city',
    'geolocation_state',
    'geolocation_zip_code_prefix'
])
sellers
# %%flake8
left_df = customers
right_df = geolocalisation
customers = pd.merge(left_df, right_df, left_on='customer_zip_code_prefix',
right_on='geolocation_zip_code_prefix', how='left')
customers
# %%flake8
# Rename columns of geolocalisation to be more explicit
customers = customers.rename(columns={
'geolocation_lat_means': 'customer_lat',
'geolocation_lng_means': 'customer_lng'
})
# Drop duplicates informations
customers = customers.drop(columns=[
'geolocation_city',
'geolocation_state',
'geolocation_zip_code_prefix'
])
# %%flake8
# Enrich order_items with product attributes.
left_df = order_items
right_df = products
order_items = pd.merge(left_df, right_df, on='product_id', how='left')
order_items
# %%flake8
# ...and with seller attributes.
left_df = order_items
right_df = sellers
order_items = pd.merge(left_df, right_df, on='seller_id', how='left')
order_items
# %%flake8
# Add payments, reviews and item detail onto orders; each left join can
# multiply rows (one row per order/payment/review/item combination).
left_df = orders
right_df = order_payments
orders = pd.merge(left_df, right_df, on='order_id', how='left')
orders
# %%flake8
left_df = orders
right_df = order_reviews
orders = pd.merge(left_df, right_df, on='order_id', how='left')
orders
# %%flake8
left_df = orders
right_df = order_items
orders = pd.merge(left_df, right_df, on='order_id', how='left')
orders
# Rows left without a matching seller after the joins.
orders[orders['seller_region'].isna()]
# %%flake8
# Final flat table joining customers to their (expanded) orders.
left_df = customers
right_df = orders
bd_olist = pd.merge(left_df, right_df, on='customer_id', how='left')
bd_olist
bd_olist[bd_olist['seller_region'].isna()]
# Drop the duplicated lat/lng columns (_x/_y suffixes) created by the
# successive merges.
del bd_olist['geolocation_lat_x']
del bd_olist['geolocation_lng_x']
del bd_olist['geolocation_lat_y']
del bd_olist['geolocation_lng_y']
bd_olist
Column creation to perform RFM segmentation, which analyzes customer behavior along three parameters — Recency: how recent the customer's last purchase is; Frequency: how often the customer makes a purchase; Monetary: how much money the customer spends.
number_order = pd.DataFrame()
# %%flake8
number_order['number_order'] = customers[
'customer_unique_id'].value_counts()
number_order = number_order.reset_index()
number_order
number_order = number_order.rename(columns={"index": "customer_unique_id"})
number_order
# %%flake8
left_df = bd_olist
right_df = number_order
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
distance_sc = bd_olist[['customer_unique_id', 'order_id']].copy()
# %%flake8
# Straight-line (Euclidean) distance between customer and seller in
# lat/lng *degrees*.  NOTE(review): this is not a geodesic distance and
# degrees are not kilometres — acceptable as a relative feature only.
distance_sc['distance_between_seller_customer'] = ((bd_olist[
    'customer_lat'] - bd_olist['seller_lat'])**2 + (bd_olist[
    'customer_lng'] - bd_olist['seller_lng'])**2)**(1/2)
distance_sc
# Missing coordinates are treated as zero distance.
distance_sc['distance_between_seller_customer'].fillna(0, inplace=True)
# Mean distance per order, then mean over each customer's orders.
dist_custom_seller = distance_sc.groupby(['customer_unique_id', 'order_id'])[
    'distance_between_seller_customer'].mean().groupby(
    ['customer_unique_id']).mean()
distance_seller_customer = dist_custom_seller.to_frame()
distance_seller_customer = distance_seller_customer.reset_index()
distance_seller_customer
# %%flake8
left_df = bd_olist
right_df = distance_seller_customer
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
bd_olist['order_purchase_timestamp'].unique()
# %%flake8
first_order = pd.DataFrame()
first_order = bd_olist[['customer_unique_id','order_id', 'order_purchase_timestamp']].copy()
first_order['date_first_order'] = first_order['order_purchase_timestamp'].apply(lambda x: x.date())
first_order
first_order_date = first_order[['customer_unique_id','order_id', 'date_first_order']].groupby(
['customer_unique_id','order_id'])['date_first_order'].min().groupby('customer_unique_id').min()
first_order_date
first_order_date = first_order_date.to_frame()
first_order_date
first_order_date = first_order_date.reset_index()
first_order_date
# %%flake8
left_df = bd_olist
right_df = first_order_date
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
last_order = pd.DataFrame()
last_order = bd_olist[['customer_unique_id',
'order_id', 'order_purchase_timestamp']].copy()
# %%flake8
last_order['date_last_order'] = last_order[
'order_purchase_timestamp'].apply(lambda x: x.date())
last_order
# %%flake8
last_order_date = last_order[['customer_unique_id',
'order_id', 'date_last_order']].groupby(
['customer_unique_id', 'order_id'])['date_last_order'].max().groupby(
'customer_unique_id').max()
last_order_date
# %%flake8
last_order_date = last_order_date.to_frame()
last_order_date
# %%flake8
last_order_date = last_order_date.reset_index()
last_order_date
# %%flake8
left_df = bd_olist
right_df = last_order_date
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
between_orders = pd.DataFrame()
between_orders = bd_olist[['customer_unique_id', 'order_id',
                           'order_purchase_timestamp',
                           'date_last_order']].copy()
# Keep only the date part of the purchase timestamp.
between_orders['order_purchase_timestamp'] = between_orders[
    'order_purchase_timestamp'].apply(lambda x: x.date())
# %%flake8
# NOTE(review): despite its name, this value is the number of days from
# each order to the customer's *last* order (then averaged below), not
# the gap between consecutive orders — confirm this is the intended
# RFM-style feature.
between_orders['mean_days_between_orders'] = (
    between_orders['date_last_order'] - between_orders[
        'order_purchase_timestamp']).apply(lambda x: x.days)
between_orders
# %%flake8
# Per-order value first, then mean over each customer's orders.
between_orders = between_orders[['customer_unique_id', 'order_id',
                                 'mean_days_between_orders']].groupby(
    ['customer_unique_id', 'order_id'])['mean_days_between_orders'].mean(
    ).groupby('customer_unique_id').mean()
between_orders
# %%flake8
between_orders = between_orders.to_frame()
between_orders
# %%flake8
between_orders = between_orders.reset_index()
between_orders
# %%flake8
# Merge the feature back onto the main table.
left_df = bd_olist
right_df = between_orders
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
bd_olist['order_purchase_timestamp'].max()
# %%flake8
days_since_last_purchase = pd.DataFrame()
days_since_last_purchase = bd_olist[['customer_unique_id',
'order_id', 'order_purchase_timestamp', 'date_last_order']].copy()
days_since_last_purchase['order_purchase_timestamp'] = days_since_last_purchase['order_purchase_timestamp'].apply(lambda x: x.date())
days_since_last_purchase['last_purchase'] = days_since_last_purchase['order_purchase_timestamp'].max()
days_since_last_purchase
# %%flake8
days_since_last_purchase['days_last_purchase'] = (days_since_last_purchase['last_purchase'] - days_since_last_purchase[
'date_last_order']).apply(lambda x: x.days)
days_since_last_purchase
# %%flake8
days_since_last_purchase = days_since_last_purchase[['customer_unique_id',
'order_id', 'days_last_purchase']].groupby(
['customer_unique_id', 'order_id'])['days_last_purchase'].mean().groupby(
'customer_unique_id').mean()
days_since_last_purchase
# %%flake8
days_since_last_purchase = days_since_last_purchase.to_frame()
# %%flake8
days_since_last_purchase = days_since_last_purchase.reset_index()
# %%flake8
left_df = bd_olist
right_df = days_since_last_purchase
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
product_per_orders = pd.DataFrame()
product_per_orders = bd_olist[['customer_unique_id',
'order_id', 'order_item_id']].copy()
# %%flake8
nb_pdt_com_moy = product_per_orders.groupby(['customer_unique_id',
'order_id'])[
'order_item_id'].max()\
.groupby(['customer_unique_id'])\
.mean()
# %%flake8
nb_pdt_com_moy = nb_pdt_com_moy.to_frame()
nb_pdt_com_moy
# %%flake8
nb_pdt_com_moy = nb_pdt_com_moy.reset_index()
nb_pdt_com_moy
nb_pdt_com_moy.rename(columns={'order_item_id': 'mean_item_per_order'}, inplace=True)
nb_pdt_com_moy['mean_item_per_order'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = nb_pdt_com_moy
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
# Total number of items ordered per customer.
product_tot = bd_olist[['customer_unique_id',
                        'order_id', 'order_item_id']].copy()
product_tot
# %%flake8
# NOTE: the original chained a second .groupby('customer_unique_id').sum()
# after this one; the result of the first sum is already indexed by
# customer_unique_id, so the second aggregation was a no-op and has been
# removed.
nb_product_tot = product_tot.groupby(['customer_unique_id'])[
    'order_item_id'].sum()
nb_product_tot
# %%flake8
nb_product_tot = nb_product_tot.to_frame()
nb_product_tot
# %%flake8
nb_product_tot = nb_product_tot.reset_index()
nb_product_tot
nb_product_tot.rename(columns={'order_item_id': 'total_nb_item_ordered'},
                      inplace=True)
# %%flake8
# Attach the per-customer total back onto the main table.
left_df = bd_olist
right_df = nb_product_tot
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
product_vol = pd.DataFrame()
product_vol = bd_olist[['customer_unique_id',
'order_id', 'volume_product']].copy()
product_vol
# %%flake8
vol_product = product_vol.groupby(['customer_unique_id'])[
'volume_product'].mean()\
.groupby(['customer_unique_id'])\
.mean()
vol_product
# %%flake8
vol_product = vol_product.to_frame()
vol_product
# %%flake8
vol_product = vol_product.reset_index()
vol_product
vol_product['volume_product'].fillna(0,inplace=True)
vol_product.rename(columns={'volume_product': 'mean_volume_item_ordered'}, inplace=True)
# %%flake8
left_df = bd_olist
right_df = vol_product
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
mean_price_product = pd.DataFrame()
mean_price_product = bd_olist[['customer_unique_id',
'order_id', 'price']].copy()
# %%flake8
price_product_mean = mean_price_product.groupby(['customer_unique_id',
'order_id'])['price']\
.mean()\
.groupby(['customer_unique_id'])\
.mean()
price_product_mean
# %%flake8
price_product_mean = price_product_mean.to_frame()
price_product_mean
# %%flake8
price_product_mean = price_product_mean.reset_index()
price_product_mean
# %%flake8
price_product_mean.rename(columns={
'price': 'mean_product_price'}, inplace=True)
# %%flake8
price_product_mean['mean_product_price'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = price_product_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
mean_freight_product = pd.DataFrame()
mean_freight_product = bd_olist[['customer_unique_id',
'order_id', 'freight_value']].copy()
# %%flake8
freight_product_mean = mean_freight_product.groupby(['customer_unique_id',
'order_id'])['freight_value']\
.mean()\
.groupby(['customer_unique_id'])\
.mean()
freight_product_mean
# %%flake8
freight_product_mean = freight_product_mean.to_frame()
freight_product_mean
# %%flake8
freight_product_mean = freight_product_mean.reset_index()
freight_product_mean
# %%flake8
freight_product_mean.rename(columns={
'freight_value': 'mean_freight_value'}, inplace=True)
# %%flake8
freight_product_mean['mean_freight_value'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = freight_product_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
mean_total_value = pd.DataFrame()
mean_total_value = bd_olist[['customer_unique_id',
'order_id', 'payment_value']].copy()
mean_total_value
# %%flake8
total_value_mean = mean_total_value.groupby(['customer_unique_id',
'order_id'])['payment_value']\
.mean()\
.groupby(['customer_unique_id'])\
.mean()
total_value_mean
# %%flake8
total_value_mean = total_value_mean.to_frame()
total_value_mean
# %%flake8
total_value_mean = total_value_mean.reset_index()
total_value_mean
# %%flake8
total_value_mean.rename(columns={
'payment_value': 'mean_order_value'}, inplace=True)
total_value_mean
# %%flake8
total_value_mean['mean_order_value'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = total_value_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
max_total_value = pd.DataFrame()
max_total_value = bd_olist[['customer_unique_id',
'order_id', 'payment_value']].copy()
max_total_value
# %%flake8
total_value_max = max_total_value.groupby(['customer_unique_id',
'order_id'])['payment_value']\
.max()\
.groupby(['customer_unique_id'])\
.max()
total_value_max
# %%flake8
total_value_max = total_value_max.to_frame()
total_value_max
# %%flake8
total_value_max = total_value_max.reset_index()
total_value_max
# %%flake8
total_value_max.rename(columns={
'payment_value': 'max_order_value'}, inplace=True)
total_value_max
# %%flake8
total_value_max['max_order_value'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = total_value_max
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
min_total_value = pd.DataFrame()
min_total_value = bd_olist[['customer_unique_id',
'order_id', 'payment_value']].copy()
min_total_value
# %%flake8
total_value_min = min_total_value.groupby(['customer_unique_id',
'order_id'])['payment_value']\
.min()\
.groupby(['customer_unique_id'])\
.min()
total_value_min
# %%flake8
total_value_min = total_value_min.to_frame()
total_value_min
# %%flake8
total_value_min = total_value_min.reset_index()
total_value_min
# %%flake8
total_value_min.rename(columns={
'payment_value': 'min_order_value'}, inplace=True)
total_value_min
# %%flake8
total_value_min['min_order_value'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = total_value_min
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
bd_olist['review_score'] = bd_olist[
'review_score'].astype('int')
# %%flake8
mean_score_review = pd.DataFrame()
mean_score_review = bd_olist[['customer_unique_id',
'order_id', 'review_score']].copy()
mean_score_review
# %%flake8
score_review_mean = mean_score_review.groupby(['customer_unique_id',
'order_id'])\
['review_score'].mean()\
.groupby('customer_unique_id')\
.mean()
score_review_mean
# %%flake8
score_review_mean = score_review_mean.to_frame()
score_review_mean
# %%flake8
score_review_mean = score_review_mean.reset_index()
score_review_mean
# %%flake8
score_review_mean.rename(columns={
'review_score': 'mean_review_score'}, inplace=True)
score_review_mean
# %%flake8
score_review_mean['mean_review_score'].fillna(0,inplace=True)
# %%flake8
left_df = bd_olist
right_df = score_review_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
bd_olist['payment_sequential'].unique()
bd_olist['payment_sequential'].fillna(0,inplace=True)
# %%flake8
mean_payment_sequential = pd.DataFrame()
mean_payment_sequential = bd_olist[['customer_unique_id',
'order_id', 'payment_sequential']].copy()
mean_payment_sequential
# %%flake8
payment_sequential_mean = mean_payment_sequential.groupby(['customer_unique_id',
'order_id'])['payment_sequential']\
.mean().round(0)\
.groupby(['customer_unique_id'])\
.mean().round(0)
payment_sequential_mean
# %%flake8
payment_sequential_mean = payment_sequential_mean.to_frame()
payment_sequential_mean
# %%flake8
payment_sequential_mean = payment_sequential_mean.reset_index()
payment_sequential_mean
# %%flake8
payment_sequential_mean.rename(columns={
'payment_sequential': 'mean_payment_sequential'}, inplace=True)
payment_sequential_mean
# %%flake8
left_df = bd_olist
right_df = payment_sequential_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
bd_olist['payment_installments'].unique()
# %%flake8
bd_olist['payment_installments'].fillna(0,inplace=True)
# %%flake8
mean_payment_installment = pd.DataFrame()
mean_payment_installment = bd_olist[['customer_unique_id',
'order_id', 'payment_installments']].copy()
mean_payment_installment
# %%flake8
payment_installment_mean = mean_payment_installment.groupby(['customer_unique_id',
'order_id'])['payment_installments']\
.mean().round(0)\
.groupby(['customer_unique_id'])\
.mean().round(0)
payment_installment_mean
# %%flake8
payment_installment_mean = payment_installment_mean.to_frame()
payment_installment_mean
# %%flake8
payment_installment_mean = payment_installment_mean.reset_index()
payment_installment_mean
# %%flake8
payment_installment_mean.rename(columns={
'payment_installments': 'mean_payment_installment'}, inplace=True)
payment_installment_mean
# %%flake8
left_df = bd_olist
right_df = payment_installment_mean
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
bd_olist['payment_type'].fillna('Unknown',inplace=True)
# %%flake8
most_payment = pd.DataFrame()
most_payment = bd_olist[['customer_unique_id',
'payment_type']].copy()
most_payment
most_payment['payment_type'].unique()
def mode_perso(serie_values):
    """Return the modal (most frequent) value of a Series.

    ``value_counts()`` sorts by descending frequency, so ``idxmax()``
    picks the top count; ties are broken by value_counts ordering.
    """
    counts = serie_values.value_counts()
    return counts.idxmax()
most_payment_mode = most_payment.groupby(by="customer_unique_id").agg(mode_perso)
most_payment_mode
# %%flake8
most_payment_mode = most_payment_mode.reset_index()
most_payment_mode
# %%flake8
most_payment_mode.rename(columns={
'payment_type': 'most_payment_used'}, inplace=True)
most_payment_mode
# %%flake8
left_df = bd_olist
right_df = most_payment_mode
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# %%flake8
bd_olist['product_category_general'].fillna('other',inplace=True)
# %%flake8
cat_most_purchased = pd.DataFrame()
cat_most_purchased = bd_olist[['customer_unique_id',
'product_category_general']].copy()
cat_most_purchased
cat_most_purchased['product_category_general'].unique()
"""# %%flake8
# Avoids errors if two customers have two preferred payments
cat_most_purchased = cat_most_purchased.groupby(['customer_unique_id',
'product_category_general'])\
.agg(lambda x: x.value_counts(
).index[0])
cat_most_purchased
# most_purchased_cat = most_purchased_cat.drop(cat_most_purchased.columns[2], axis=1)
"""
def mode_perso(serie_values):
    """Aggregate a Series to its most frequent value (the mode).

    NOTE: this redefines the identical helper declared earlier in the
    notebook; harmless, but one shared definition would suffice.
    """
    return serie_values.value_counts().idxmax()
most_purchased_cat = cat_most_purchased.groupby(by="customer_unique_id").agg(mode_perso)
most_purchased_cat
# %%flake8
most_purchased_cat = most_purchased_cat.reset_index()
most_purchased_cat
# %%flake8
most_purchased_cat.rename(columns={
'product_category_general': 'category_most_purchased'}, inplace=True)
most_purchased_cat
# %%flake8
left_df = bd_olist
right_df = most_purchased_cat
bd_olist = pd.merge(left_df, right_df, on='customer_unique_id', how='left')
bd_olist
# Amount spent per customer in each general category: pivot to one
# column per category with summed price, 0 where the customer never
# bought from that category.
table_cat = bd_olist[['customer_unique_id', 'price',
                      'product_category_general']].copy()
table_cat
table_cat = pd.pivot_table(table_cat,
                           index='customer_unique_id',
                           columns='product_category_general',
                           values=['price'],
                           aggfunc=np.sum,
                           fill_value=0).reset_index().swaplevel(0, 1, axis=1)
# Flatten the (category, 'price') MultiIndex into '<category>_for_price'
# column names.
table_cat.columns = [str(s2) + '_for_' + s1
                     for (s1, s2) in table_cat.columns.tolist()]
# The index column flattens to 'customer_unique_id_for_' — restore it.
table_cat = table_cat.rename(
    columns={"customer_unique_id_for_": "customer_unique_id"})
# %%flake8
left_df = bd_olist
right_df = table_cat
bd_olist = pd.merge(left_df, right_df, left_on = 'customer_unique_id',
right_on = 'customer_unique_id')
bd_olist
Number of days from 01/01/01.
bd_olist['date_first_order'] = bd_olist['date_first_order']\
.apply(lambda x:x.toordinal())
bd_olist['date_last_order'] = bd_olist['date_last_order']\
.apply(lambda x:x.toordinal())
bd_olist.columns
"""columns to remove :
'customer_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state', 'customer_lat',
'customer_lng','order_id','order_status','order_purchase_timestamp','order_approved_at',
'order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date',
'payment_sequential','payment_type','payment_installments','payment_value','review_id','review_score',
'review_comment_title','review_comment_message','review_creation_date','review_answer_timestamp',
'order_item_id', 'product_id','seller_id','shipping_limit_date','price','freight_value','product_category_name',
'product_name_lenght','product_description_lenght','product_photos_qty','product_weight_g','product_length_cm',
'product_height_cm','product_width_cm', 'volume_product','product_category_name_english','product_category_general',
'seller_zip_code_prefix','seller_city','seller_state','seller_lat','seller_lng', 'seller_region', month_first_order, year_first_order,
year_last_order, month_last_order, 'max_order_value', 'min_order_value', 'total_nb_item_ordered', """
# %%flake8
def remove_columns(dataframe, columns_to_keep=None):
    """Return a new DataFrame restricted to the selected columns.

    Columns absent from ``dataframe`` are skipped with a console notice
    instead of raising.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Source dataset.
    columns_to_keep : list of str, optional
        Names of the columns to retain. Defaults to the feature set
        chosen for this customer-segmentation study.

    Returns
    -------
    pd.DataFrame
        New frame holding only the requested (and present) columns,
        in the order of ``columns_to_keep``.
    """
    if columns_to_keep is None:
        columns_to_keep = [
            'customer_unique_id', 'customer_region',
            'number_order', 'distance_between_seller_customer',
            'date_last_order', 'mean_days_between_orders', 'mean_item_per_order',
            'mean_volume_item_ordered', 'date_first_order',
            'mean_product_price', 'mean_freight_value', 'mean_order_value',
            'mean_review_score',
            'mean_payment_sequential', 'mean_payment_installment',
            'most_payment_used', 'category_most_purchased',
            'price_for_Office_equipment_furniture',
            'price_for_clothing_accessories', 'price_for_electronic_equipment',
            'price_for_food_drink', 'price_for_furniture_decoration_home',
            'price_for_garden_pets', 'price_for_home_appliances',
            'price_for_hygiene_health_wellness', 'price_for_leisure_homemade',
            'price_for_other', 'price_for_repairs_constructions']
    new = pd.DataFrame()
    for column in columns_to_keep:
        try:
            new[column] = dataframe[column]
        # Narrow except: only a missing column is an expected failure here;
        # the bare except in the original would have hidden real bugs.
        except KeyError:
            print('...colonne non présente : ', column)
            print('\n')
    print("All selected columns have been kept from the dataset")
    return new
# %%flake8
# Keep only the features selected for the segmentation analysis.
data = remove_columns(bd_olist)
# %%flake8
data.shape
I decided to keep 31 of the 68 columns for further analysis.
list(data.columns)
customer_unique_id : unique identifier of a customer
customer_region : customer region in Brazil
seller_region : seller region in Brazil
distance_between_seller_customer : Distance between the residence of the customer and the seller
number_order : number of orders
date_first_order : date of first order
date_last_order : date of last order
days_since_lastorder : number of days between the last order and today
mean_days_between_orders : average time (in days) between two orders
mean_item_per_order : average number of products per order
total_nb_item_ordered : total number of products ordered by a customer
mean_volume_item_ordered : average volume of products ordered by a customer
mean_product_price: average price of products ordered
mean_freight_value : average cost of shipping
mean_order_value : average price of the total order
max_order_value : max price of the total order
min_order_value : min price of the total order
mean_review_score : average score given to ordered products
mean_payment_sequential : number of payment methods used to pay for orders
mean_payment_installment : average number of installments per order
most_payment_used : Most used means of payment
category_most_purchased : Most purchased product category
data.dtypes
# Deduplicate on customer_unique_id — helper defined earlier in the notebook;
# presumably keeps one row per customer (verify against its definition).
data_fin = duplicate(data, 'customer_unique_id')
# %%flake8
data_fin.shape
I removed 21363 duplicate rows.
data_fin.head()
# %%flake8
# Keep only columns with a fill rate of at least 50%
# (helper defined earlier in the notebook).
data_final = filtration_columns(data_fin, 0.50)
The selected column fill rate is 50% in order to have as much real data as possible as well as useful variables for the exploration.
# %%flake8
# Summary table: one row per dataset with its name and dimensions.
datasets = [data_final]
list_names = ['bd_finale_olist']
resume = []
for name, dataset in zip(list_names, datasets):
    print("General presentation of the dataset {}.".format(name))
    informations(dataset)
    resume.append({'Dataset': name,
                   'Lines': dataset.shape[0],
                   'Columns': dataset.shape[1]})
pd.DataFrame(resume)
This final CSV, regrouping all the tables seen previously, is composed of 96096 lines for 20 columns.
data_final.head()
data_final.shape
# %%flake8
data_final.isna().sum()
# %%flake8
# Pie chart of the NaN proportion (helper defined earlier in the notebook).
pie_NaN(data_final, (10, 10))
# %%flake8
# missingno matrix: visual overview of missing values per column.
msno.matrix(data_final)
There is no missing data.
data_final.dtypes
# %%flake8
# Histogram of each numeric column (helper defined earlier in the notebook).
plot_multiple_histograms(data_final, (28, 18))
print("Verification of the normality of the numerical columns of the final dataset.")
# Normality-test helper defined earlier in the notebook.
verif_normalite(data_final)
The variables do not have a Normal distribution so the Outliers are removed by the Percentile method.
# %%flake8
# NOTE(review): the return value is discarded — confirm this helper mutates
# data_final in place, otherwise the outliers are not actually removed.
delete_outliers_percentiles(data_final)
# %%flake8
# Re-plot the distributions after outlier removal for comparison.
plot_multiple_histograms(data_final, (28, 18))
# # %%flake8
graph_fill_variable(data_final, (5, 10))
Overall, the majority of the columns are completely filled. Only the two columns, title and message, have many missing values.
# %%flake8
# Re-check the dataset name and dimensions after cleaning.
datasets = [data_final]
list_names = ['bd_finale_olist']
resume = []
for name, dataset in zip(list_names, datasets):
    print("General presentation of the dataset {}.".format(name))
    informations(dataset)
    resume.append({'Dataset': name,
                   'Lines': dataset.shape[0],
                   'Columns': dataset.shape[1]})
pd.DataFrame(resume)
# %%flake8
data_final.dtypes
# %%flake8
plt.figure(figsize=(30, 15))
# Mask the upper triangle so each correlation pair appears only once.
# The np.bool alias was removed in NumPy 1.24; the builtin bool is the
# documented replacement and behaves identically here.
mask = np.triu(np.ones_like(data_final.corr(), dtype=bool))
heatmap = sns.heatmap(data_final.corr(), mask=mask, vmin=-1, vmax=1,
                      annot=True)
heatmap.set_title('Correlation matrix between the different variables of the '
                  'dataset', fontdict={'fontsize': 18}, pad=16)
data_final.columns
The correlation coefficient is between -1 and 1. The closer the coefficient is to 1, the stronger the positive linear relationship between the variables. The closer the coefficient is to -1, the stronger the negative linear relationship between the variables. The closer the coefficient is to 0, the weaker the linear relationship between the variables.
To determine which variables are strongly related to each other, a limit of 0.20 for the correlation coefficient was arbitrarily chosen.
Analysis of the relationships between certain variables:
data_final.shape
# %%flake8
# Random 5000-row sample to keep the scatter plots readable.
# NOTE(review): consider passing random_state=... for reproducibility.
subset = data_final.sample(n=5000)
# %%flake8
# scatter_plot is a helper defined earlier in the notebook.
scatter_plot(subset, "distance_between_seller_customer", "mean_freight_value", data_final)
Thanks to this graph, we can see that there is a relationship between the distance seller-buyer and the shipping costs. It is possible to say that the greater the distance between a seller and a buyer, the higher the price of shipping costs will be.
# %%flake8
# Pairwise scatter plots (helper defined earlier) between candidate features.
scatter_plot(subset, "mean_item_per_order", "mean_order_value", data_final)
Thanks to this graph, we can see that there is a relationship between the average number of products per order and the value of each order. It is possible to say that the more products there are in the order, the higher the price of the order will be.
# %%flake8
scatter_plot(subset, "mean_volume_item_ordered", "mean_product_price", data_final)
Thanks to this graph, we can see that there is a relationship between the volume of a product and its price. It is possible to say that the more a product is voluminous, the higher its price will be.
# %%flake8
scatter_plot(subset, "mean_volume_item_ordered", "mean_freight_value", data_final)
Thanks to this graph, we can see that there is a relationship between the volume of a product and its shipping costs. It is possible to say that the more a product is voluminous, the higher its shipping costs will be.
# %%flake8
scatter_plot(subset, "mean_volume_item_ordered", "mean_order_value", data_final)
Thanks to this graph, we can see that there is a relationship between the volume of a product and the price of the order. It is possible to say that the more a product is voluminous, the more the price of the order will be high.
# %%flake8
scatter_plot(subset, "mean_product_price", "mean_freight_value", data_final)
Thanks to this graph, we can see that there is a relationship between the price of a product and the shipping costs. It is possible to say that the more expensive a product is, the higher the shipping costs will be.
# %%flake8
scatter_plot(subset, "mean_product_price", "mean_order_value", data_final)
Thanks to this graph, we can see that there is a relationship between the price of a product and the total price of the order. It is possible to say that the more expensive a product is, the higher the price of the order will be.
# %%flake8
scatter_plot(subset, "mean_product_price", "mean_payment_installment", data_final)
Thanks to this graph, we can see that there is a relationship between the price of a product and the number of payment installments. It is possible to say that the more expensive a product is, the higher the number of payments will be.
# %%flake8
# Scatter plots of shipping cost against price, order value and installments.
scatter_plot(subset, "mean_freight_value", "mean_product_price", data_final)
Thanks to this graph, we can see that there is a relationship between the shipping costs and the price of a product. We can say that the higher the shipping costs are, the more expensive the product will be.
# %%flake8
scatter_plot(subset, "mean_freight_value", "mean_order_value", data_final)
Thanks to this graph, we can see that there is a relationship between the shipping costs and the price of a product. We can say that the higher the shipping costs are, the higher the price of the order will be.
# %%flake8
scatter_plot(subset, "mean_freight_value", "mean_payment_installment", data_final)
Thanks to this graph, we can see that there is a relationship between the shipping costs and the number of payments. We can say that the higher the shipping costs are, the higher the number of payments will be.
# %%flake8
# Scatter plot of total order value against the number of installments.
scatter_plot(subset, "mean_order_value", "mean_payment_installment", data_final)
Thanks to this graph, we can see that there is a relationship between the total price of the order and the number of payment installments. We can say that the higher the total price of the order, the higher the number of payments will be.
# %%flake8
# On Google Colab: mount Drive and copy the exported CSV there.
from google.colab import drive
drive.mount('/content/drive/')
data_final.to_csv('bd_olist_finale.csv', index=False)
!cp bd_olist_finale.csv /content/drive/My\ Drive/
# Local (non-Colab) save path, kept disabled as a bare string:
"""# %%flake8
data_final.to_csv('csv_produits/bd_olist_finale.csv', index=False)"""
data_final.head()
data_final.dtypes
data_final.shape